Importing Libraries

library(ggplot2)
library(tidyverse)
library(dplyr)
library(corrplot)
library(naniar)
library(plotly)
library(data.table)           #for fread()
library(plyr)
library(scales)               #to overrride the default breaks, labels
library(cowplot)
library(lubridate)
library(crosstable)
library(kableExtra)
library(DT)
library(caret)                #for createdatapartition()
library(forcats)
library(TTR)                  #for SMA() 
library(tidyr)
library(jtools)               #for effect_plot()      

1. About COVID dataset

This COVID dataset is obtained from Our World in Data github repository. They continuously update the COVID dataset. This dataset contains 67 variables and 253472 observations but I will be only using few of the variables for the analysis.

covid.data <- fread("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

str(covid.data)
## Classes 'data.table' and 'data.frame':   254169 obs. of  67 variables:
##  $ iso_code                                  : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ continent                                 : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ location                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date                                      : IDate, format: "2020-02-24" "2020-02-25" ...
##  $ total_cases                               : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ new_cases                                 : num  5 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed                        : num  NA NA NA NA NA 0.714 0.714 0 0 0 ...
##  $ total_deaths                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths                                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_cases_per_million                   : num  0.122 0.122 0.122 0.122 0.122 0.122 0.122 0.122 0.122 0.122 ...
##  $ new_cases_per_million                     : num  0.122 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed_per_million            : num  NA NA NA NA NA 0.017 0.017 0 0 0 ...
##  $ total_deaths_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_per_million                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed_per_million           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ reproduction_rate                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients_per_million                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions_per_million         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions_per_million        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests                               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests                                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests_per_thousand                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_per_thousand                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed_per_thousand           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ positive_rate                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_per_case                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_units                               : chr  "" "" "" "" ...
##  $ total_vaccinations                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_vaccinations_per_hundred            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated_per_hundred             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated_per_hundred       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters_per_hundred                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed_per_million     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed_per_hundred: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stringency_index                          : num  8.33 8.33 8.33 8.33 8.33 ...
##  $ population_density                        : num  54.4 54.4 54.4 54.4 54.4 ...
##  $ median_age                                : num  18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
##  $ aged_65_older                             : num  2.58 2.58 2.58 2.58 2.58 ...
##  $ aged_70_older                             : num  1.34 1.34 1.34 1.34 1.34 ...
##  $ gdp_per_capita                            : num  1804 1804 1804 1804 1804 ...
##  $ extreme_poverty                           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cardiovasc_death_rate                     : num  597 597 597 597 597 ...
##  $ diabetes_prevalence                       : num  9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
##  $ female_smokers                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ male_smokers                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ handwashing_facilities                    : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ hospital_beds_per_thousand                : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ life_expectancy                           : num  64.8 64.8 64.8 64.8 64.8 ...
##  $ human_development_index                   : num  0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
##  $ population                                : num  41128772 41128772 41128772 41128772 41128772 ...
##  $ excess_mortality_cumulative_absolute      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative_per_million   : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>

2. Data Cleaning

2.1 Missing Data?

I used naniar package gg_miss_var() function to visualize the missing data in whole dataframe.

#checking if there are missing data
gg_miss_var(covid.data)

2.2 Treating the missing data

Missing data cannot be neglected. Hence, I replaced the missing data with 0 and mean of respective variables. Since, I won’t use all the variables, I only replace the missing data in variables that I am likely to use in further steps. For the variables of which missing data to be replaced with 0, I replaced the NAs directly without using group_by(location) while for other of which NAs to be replaced with mean, I used group_by(location) first.

covid.data1 <- covid.data %>%
  mutate_at(vars(new_cases, total_cases, new_deaths, total_deaths, tests_per_case, new_tests, reproduction_rate, icu_patients, hosp_patients, people_fully_vaccinated),
            ~replace_na(.,
                        0)) %>%
  group_by(location) %>%
  mutate(human_development_index = replace_na(human_development_index, mean(human_development_index, na.rm = T)),
         population_density = replace_na(population_density, mean(population_density, na.rm = T)),
         gdp_per_capita = replace_na(gdp_per_capita, mean(gdp_per_capita, na.rm = T)),
         extreme_poverty = replace_na(extreme_poverty, mean(extreme_poverty, na.rm = T)),
         male_smokers = replace_na(male_smokers, mean(male_smokers, na.rm = T)),
         female_smokers = replace_na(female_smokers, mean(female_smokers, na.rm = T)),
         icu_patients = replace_na(icu_patients, mean(icu_patients, na.rm = T)),
         hosp_patients = replace_na(hosp_patients, mean(hosp_patients, na.rm = T)),
         handwashing_facilities = replace_na(handwashing_facilities, mean(handwashing_facilities, na.rm = T)),
         median_age = replace_na(median_age, mean(median_age, na.rm = T)),
         stringency_index = replace_na(stringency_index, mean(stringency_index, na.rm = T))) %>%
  ungroup()

2.3 Date column into date format

covid.data1$date <- as.Date(covid.data1$date, format = "%Y-%m-%d")

2.4 Cleaning continent and location column

While looking at the dataset, I found out that the location column has continents also which must be removed.

#Removing continents from country column
continents <- c("Asia", "Africa", "European Union", "Europe", "High income", "Lower middle income", "Low Income", "Upper middle income", 'Oceania', "South America", "North America", "International", "World")
covid <- subset(covid.data1, !(location %in% continents))

#Removing the blank spaces from continent column
covid <- subset(covid, !(continent == ""))

3. Data Manipulation and Visualizations

3.1 Countrywise COVID database

#creating month column
year.month <- covid %>%
  mutate(Month = as.character(lubridate::month(date)),
         Year = lubridate::year(date)) %>%
  select(continent, location, Year, Month, new_cases, new_deaths)

#recoding the month column
year.month$Month <- recode(year.month$Month, 
                      "1" = "January",
                      "2" = "February",
                      "3" = "March",
                      "4" = "April",
                      "5" = "May",
                      "6" = "June", 
                      "7" = "July",
                      "8" = "August",
                      "9" = "September",
                      "10" = "October",
                      "11" = "November",
                      "12" = "December")

covid_summary_statistics <- year.month %>%
  group_by(location, Month, Year) %>%
  dplyr::summarise(Total.Deaths = sum(new_deaths, na.rm = T),
         Total.Cases = sum(new_cases, na.rm = T),
         Mean.cases.per.day = round(mean(new_cases, na.rm =T),2),
         Mean.deaths.per.day = round(mean(new_deaths, na.rm =T),2))

covid_summary_statistics %>%
  select(location, Year, Month, Total.Cases, Total.Deaths, Mean.cases.per.day, Mean.deaths.per.day) %>%
  datatable(
    rownames = F,
    class = "cell-border stripe",
    colnames = c("Country", "Year", "Month", "Total cases", "Total Deaths", "Mean cases per day", "Mean Deaths per day"),
    caption = "Country wise COVID-19 Cases and Deaths",
    options = list(columnDefs = list(list(className = "dt-center", targets = 0:1)))
  )

3.2 Descriptive Statistics

descriptive.stat <- covid %>%
  select(new_cases, new_deaths, total_deaths, total_cases, tests_per_case)

summary(descriptive.stat)
##    new_cases         new_deaths        total_deaths      total_cases       
##  Min.   :      0   Min.   :    0.00   Min.   :      0   Min.   :        0  
##  1st Qu.:      0   1st Qu.:    0.00   1st Qu.:     26   1st Qu.:     2521  
##  Median :     17   Median :    0.00   Median :    499   Median :    36415  
##  Mean   :   2789   Mean   :   28.33   Mean   :  17365   Mean   :  1216952  
##  3rd Qu.:    482   3rd Qu.:    5.00   3rd Qu.:   5835   3rd Qu.:   390794  
##  Max.   :1354499   Max.   :59895.00   Max.   :1108482   Max.   :102345676  
##  tests_per_case     
##  Min.   :      0.0  
##  1st Qu.:      0.0  
##  Median :      0.0  
##  Mean   :    945.3  
##  3rd Qu.:     10.6  
##  Max.   :1023631.9

3.3 Heatmaps

3.3.1 Heatmap of total COVID infections

#latest day
day_latest <- max(covid$date)

#creating heatmaps
covid.cases <- covid %>%
  group_by(location) %>%
  filter(date == max(date))

#creating covid cases heat maps
line <- list(color = toRGB("#d1d1d1"), width = 0.4)
heatmap <- list(
  showframe = F,
  showcoastlines = F,
  projection = list(type = "orthographic"),
  resolution = "100",
  showcountries = T,
  countrycolor = "#d1d1d1",
  showocean = T,
  oceancolor = '#064273',
  showlakes = T,
  lakecolor = '#99c0db',
  showrivers = T,
  rivercolor = '#99c0db',
  bgcolor = '#e8f7fc')

plot_geo() %>%
  layout(geo = heatmap,
         paper_bgcolor = '#e8f7fc',
         title = paste0("World COVID-19 Confirmed Cases till ", day_latest)) %>%
  add_trace(data = covid.cases,
            z = ~total_cases,
            colors = "Reds",
            text = ~location,
            locations = ~iso_code,
            marker = list(line = line))

3.3.2 Heatmap of total COVID deaths

##Heatmap for covid deaths
plot_geo() %>%
  layout(geo = heatmap,
         paper_bgcolor = '#e8f7fc',
         title = paste0("World COVID-19 deaths till ",  day_latest)) %>%
  add_trace(data = covid.cases,
            z = ~total_deaths,
            colors = "Reds",
            text = ~location,
            locations = ~iso_code,
            marker = list(line = line))

3.4 Trend of total COVID cases and deaths

#Trend of world covid cases and deaths
covid %>%
  group_by(date) %>%
  filter(date != day_latest) %>%
  dplyr::summarise(total_deaths = sum(total_deaths, na.rm = T), 
                   total_cases = sum(total_cases, na.rm = T), .groups = "drop") %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = total_cases + 1), color = "#2e9449", linewidth = 1.5) +
  geom_line(aes(y = total_deaths + 1), linewidth = 1.5, linetype = 2, color = "#9c2742") +
  scale_y_continuous(trans = "log10", labels = comma) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  labs(title = "Global COVID infections and deaths",
       subtitle = paste0("Till ", day_latest - 1),
       x = "",
       y = "Log10 transformation") +
  theme_apa() +
  theme(axis.text.x = element_text(angle = 90, color = "black", hjust = 1),
        axis.text = element_text(color = "black")) +
  geom_vline(xintercept = as.Date("2020-03-11"), linetype = "longdash", linewidth = 0.8, col = "black") +
  annotate("text", x = as.Date("2020-03-10"), y = 11100, label = "WHO announces pandemic \n", size = 4.2, angle = 90) +
  geom_vline(xintercept = as.Date("2020-01-30"), linetype = "longdash", linewidth = 0.8, col = "black") +
  annotate("text", x = as.Date("2020-01-20"), y = 16100, label = "Global health emergency declared \n", size = 4.2, angle = 90) +
  annotate("text", x = as.Date("2021-05-05"), y = 1000000, label = "Total Deaths \n", size = 4.2) +
  annotate("text", x = as.Date("2021-05-05"), y = 50000000, label = "Total Cases \n", size = 4.2)

3.5 Trend of new COVID cases and deaths

3.5.1 Trend of new COVID cases

p1 <- covid %>%
  group_by(date, continent) %>%
  dplyr::summarise(new_covid_cases = sum(new_cases, na.rm = T), .groups = "drop") %>%
  ggplot(aes(date)) +
  geom_col(aes(y = new_covid_cases, color = continent)) +
  labs(
    title = "Trend of New COVID-19 cases in different continents",
    subtitle = paste0("Till ", day_latest - 1),
    y = "",
    x = ""
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 0.6)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  facet_wrap(~continent)
p1

3.5.2 Trend of new COVID deaths

p2 <- covid %>%
  group_by(date, continent) %>%
  dplyr::summarise(new_covid_deaths = sum(new_deaths, na.rm = T)) %>% 
  ggplot(aes(date)) +
  geom_col(aes(y = new_covid_deaths, color = continent)) +
  labs(
    title = "Trend of New COVID-19 deaths in different continents",
    subtitle = paste0("Till ", day_latest - 1),
    y = "",
    x = ""
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 0.6)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  facet_wrap(~continent)
p2

3.6 COVID cases in different months in different continents

#COVID cases by months
month.df <- year.month %>%
  group_by(Month, continent) %>%
  dplyr::summarise(total.cases = sum(new_cases, na.rm = T),
                   total.deaths = sum(new_deaths, na.rm = T))

month.df$Month <- factor(month.df$Month, levels=c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))

p3<-ggplot(month.df, aes(x = Month, y = total.cases, fill = continent)) +
  geom_col(position = "dodge", color = "black") +
  # facet_wrap(~continent) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust=0.2),
        axis.text = element_text(size = 9, color = "black")) +
  labs(
    x = "",
    y = "Total Cases",
    title = "COVID - 19 cases in different continents in different months",
    subtitle = paste0("Till ", day_latest-0)
  ) +
  scale_y_continuous(label = comma) +
  coord_flip()
ggplotly(p3)

3.7 Countries with most COVID cases and deaths

3.7.1 Top Ten countries with most COVID infections

top.10.covid.countries <- covid %>%
  select(location, new_cases, date) %>%
  group_by(location) %>%
  dplyr::summarise(total.cases = sum(new_cases, na.rm = T)) %>%
  top_n(10, total.cases) %>%
  arrange(desc(total.cases)) %>%
  mutate(country.reordered = fct_reorder(location, total.cases))

p4 <- top.10.covid.countries %>%
  ggplot(aes(country.reordered, total.cases)) +
  geom_col() +
  geom_text(aes(label=total.cases), hjust = -0.1, size = 3) +
  scale_y_continuous(label = comma) +
  labs(
    x = "Total Cases",
    y = "",
    title = "Top 10 countries with highest COVID-19 infections",
    subtitle = paste0("Till ", day_latest-1)
  ) +
  theme_bw() +
  coord_flip()
p4

3.7.2 Top Ten countries with most COVID deaths

top.10.covid.deaths.countries <- covid %>%
  select(date, location, new_deaths) %>%
  group_by(location) %>%
  dplyr::summarise(total.deaths = sum(new_deaths, na.rm = T)) %>%
  top_n(10, total.deaths) %>%
  arrange(desc(location)) %>%
  mutate(country.reordered = fct_reorder(location, total.deaths))
p5 <- top.10.covid.deaths.countries %>%
  ggplot(aes(country.reordered, total.deaths)) +
  geom_col() +
  geom_text(aes(label = total.deaths), hjust = -0.1, size = 3) +
  scale_y_continuous(label = comma) +
  labs(x = "",
       y = "Total Deaths",
       title = "Top 10 countries with most COVID-19 deaths",
       subtitle = paste0("Till ", day_latest-1)) +
  theme_bw() +
  coord_flip()
p5

3.8 Bubble Charts

3.8.1 Bubble chart of total COVID cases of different countries

p6 <- covid %>%
  filter(date == max(day_latest - 1)) %>%
  select(total_cases, human_development_index, location, continent, population) %>%
  mutate(location = factor(location)) %>%
  ggplot(aes(human_development_index, total_cases/1000, size = population, color = continent)) +
  geom_point(aes(text = location), alpha = 0.5) +
  scale_size(range=c(1,10), name = "") +
  theme_bw() +
  labs(
    x = "Human Development Index",
    y = "Total cases in thousands"
  )
   
ggplotly(p6, tooltip = c("text", "total_cases", "human_development_index")) %>%
  layout(title = list(text = paste0('Total Covid cases in different countries',
                                    '<br>',
                                    '<sup>',
                                    'Size of bubble denotes the population','</sup>')),
         margin = list(t = 70))
  • United States of America has the highest COVID infections.
  • Most of the countries with high COVID cases are European nations with few Asian.
  • The bubble chart shows that countries which score high in HDI have more covid cases than compared countries which score low in HDI. This may be because of the large number of travels by people to and from these countries i.e. countries with high HDI.

3.8.2 Bubble chart of total COVID deaths of different countries

p7 <- covid %>%
  filter(date == max(day_latest - 1)) %>%
  select(total_deaths, human_development_index, location, continent, population) %>%
  mutate(location = factor(location)) %>%
  ggplot(aes(human_development_index, total_deaths/1000, size = population, color = continent)) +
  geom_point(aes(text = location), alpha = 0.5) +
  scale_size(range=c(1,10), name = "") +
  theme_bw() +
  labs(
    x = "Human Development Index",
    y = "Total deaths in thousands"
  )
 
ggplotly(p7, tooltip = c("text", "total_deaths", "human_development_index")) %>%
  layout(title = list(text = paste0('Total Covid deaths in different countries',
                                    '<br>',
                                    '<sup>',
                                    'Size of scatter plot denotes the covid deaths','</sup>')),
         margin = list(t = 70))
  • The highest COVID related deaths is in US followed by Brazil.
  • The COVID related deaths is high in countries with high HDI.This is either due to (i) high COVID cases in those countries, (ii) large number of old age people (iii) large number of people with chronic health condition in those countries.

3.8.3 Bubble chart of COVID infection fatality rate of different countries

p8 <- covid %>%
  filter(date == max(day_latest - 1),
         location != "North Korea") %>%
  select(total_deaths, total_cases, location, continent) %>%
  mutate(location = factor(location),
         infection_fatality_rate = (total_deaths/total_cases)*100) %>%
  ggplot(aes(total_cases/1000, total_deaths/1000, size = infection_fatality_rate, color = continent)) +
  geom_point(aes(text = location), alpha = 0.45) +
  scale_size(range=c(1,10), name = "") +
  theme_bw() +
  labs(
    x = "Total cases in thousands",
    y = "Total deaths in thousands"
  )

ggplotly(p8, tooltip = "all") %>%
  layout(title = list(text = paste0('Total Covid cases vs total deaths in different countries',
                                    '<br>',
                                    '<sup>',
                                    'Size of bubble is based on infection fatality rate','<br>',
                                    'Infection mortality rate vary depending on fairness of reported covid cases and deaths', '</sup>')),
         margin = list(t = 100))

3.9 Correlation Matrix

covid.data.corr <- covid %>%
  select(new_cases, new_deaths, tests_per_case, new_tests, reproduction_rate, icu_patients, hosp_patients, human_development_index, gdp_per_capita, extreme_poverty, male_smokers, female_smokers, handwashing_facilities)

#Plotting the correlation matrix
cor <- cor(covid.data.corr)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(cor, method = 'color', 
         type = "upper", #Displays only upper part of the matrix
         order = "hclust",
         col=col(200),
         addCoef.col = "black",  #Add coeffiecient of correlation
         number.cex = 0.8,
         tl.col="black",  #Text label color
         tl.srt=90,  #Text label rotation
         diag = FALSE,
         sig.level = 0.01, insig = "blank")

4. COVID in Nepal

4.1 Total COVID cases and deaths

covid.nepal <- covid %>%
  filter(location == "Nepal") %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = total_cases + 1), color = "#2e9449", linewidth = 1) +
  geom_line(aes(y = total_deaths + 1), linewidth = 1, linetype = 2, color = "#9c2742") +
  scale_y_continuous(trans = "log10", labels = comma) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  labs(title = "Global COVID infections and deaths",
       subtitle = paste0("Till ", day_latest - 1),
       x = "",
       y = "Log10 transformation") +
  theme_apa() +
  theme(axis.text.x = element_text(angle = 90, color = "black", hjust = 1),
        axis.text = element_text(color = "black")) +
  annotate("text", x = as.Date("2021-05-05"), y = 9000, label = "Total Deaths \n", size = 3.8) +
  annotate("text", x = as.Date("2021-05-05"), y = 450000, label = "Total Cases \n", size = 3.8)  
  
ggplotly(covid.nepal) %>%
  layout(title = list(text = paste0('Global COVID infections and deaths',
                                    '<br>',
                                    '<sup>',
                                    paste0('TIll ',day_latest-1),'</sup>')),
         margin = list(t = 50))

4.2 New COVID cases and deaths

4.2.1 New COVID cases

##data manipulation for trend of new cases and new deaths in nepal
covid.nepal1 <- covid %>%
  filter(location == "Nepal") %>%
  mutate(new.cases.smoothed  = as.integer(SMA(new_cases, n = 14)),
         new.deaths.smoothed = as.integer(SMA(new_deaths, n = 14))) %>%     #as.integer is used to convert numbers with decimals into integers
  select(date, new.cases.smoothed, new.deaths.smoothed) %>%
  filter(date > "2020-02-06")     #To remove rows with NAs induced due to smoothing (14 days simple moving average)

##trend of new covid cases in nepal
p9 <- covid.nepal1 %>%
  ggplot(aes(date, new.cases.smoothed)) +
  geom_line(linewidth = 1, color = "#2C8C33") +
  labs(x = "",
       y = "Number of new cases",
       title = "Trend of New Covid infections in Nepal (14-days smoothed)",
       subtitle = paste0("Till ", day_latest - 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  annotate(geom="text", x=as.Date("2022-03-15"), y=8289, 
           label="Per day new covid cases reached\nmaximum i.e. 8,589", size = 3.5) +
  annotate(geom="point", x=as.Date("2021-05-18"), y=8589, size=6, shape=21, fill="transparent")

p9

4.2.2 New COVID deaths

p10 <- covid.nepal1 %>%
  ggplot(aes(date, new.deaths.smoothed)) +
  geom_line(linewidth = 1, color = "#CF5C5C") +
  labs(x = "",
       y = "Number of new deaths",
       title = "Trend of New Covid related deaths in Nepal (14-days smoothed)",
       subtitle = paste0("Till ", day_latest - 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  annotate(geom="text", x=as.Date("2022-03-25"), y=185, 
           label="Per day new covid deaths reached\nmaximum i.e. 190", size = 3.5) +
  annotate(geom="point", x=as.Date("2021-05-24"), y=189, size=6, shape=21, fill="transparent")

p10

4.3 Polynomial Regression of new covid cases in Nepal

nepal.df <- covid %>%
  filter(location == "Nepal") %>%
  select(new_tests, new_cases, stringency_index)
plot(nepal.df$new_tests, nepal.df$new_cases)

This plot shows that new_tests and new_cases have strong positive correlation. But it seems new_tests has few outliers which can be removed before fitting a polynomial regression model.

#Identifying the outliers position in new_tests
boxplot(nepal.df$new_tests)

out <- boxplot.stats(nepal.df$new_tests)$out
out_ind <- which(nepal.df$new_tests %in% c(out))
#Removing the outliers
nepal.df1 <- nepal.df[-out_ind,]
#Regression model
model <- lm(new_cases ~ poly(new_tests,2) + stringency_index, data = nepal.df1)
summary(model)
## 
## Call:
## lm(formula = new_cases ~ poly(new_tests, 2) + stringency_index, 
##     data = nepal.df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5659.8  -289.4   -16.6   194.5  7900.5 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           302.365     64.764   4.669  3.4e-06 ***
## poly(new_tests, 2)1 35220.762    994.467  35.417  < 2e-16 ***
## poly(new_tests, 2)2 24667.004    889.493  27.732  < 2e-16 ***
## stringency_index       11.801      1.174  10.053  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 875.6 on 1097 degrees of freedom
## Multiple R-squared:  0.7241, Adjusted R-squared:  0.7234 
## F-statistic: 959.7 on 3 and 1097 DF,  p-value: < 2.2e-16
  • Both independent variables new_tests and stringency_index have significant impact on new_cases at all level of significance.

  • The polynomial regression model is \(Y = 303.967 + 35187.011X_1 + 24672.382{X_1}^2 + 11.814X_2 + Error\).

    Here \(X_1\) is new_tests and \(X_2\) is stringency index.

  • Adjusted R-squared is 72.3% indicating that these two predictor variables result in 72.3% variance in the dependent variable i.e., new_cases and rest 27.7% by other factors not in this regression model.

residplot(model)

effect_plot(model, pred = new_tests, interval = T, plot.points = T, line.colors = "#4669E8")